From f683335fb0d06aef7170fac523ae2ab83173a7de Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@akuvian.org>
Date: Thu, 25 Apr 2013 17:50:53 +0000
Subject: [PATCH] Optimize gamma correction

Switch from Chebyshev polynomial to Newton's method, which is both simpler and
1.5x faster for the same precision.
---
 babl/base/pow-24.c | 205 +++++++++++----------------------------------
 1 file changed, 51 insertions(+), 154 deletions(-)

diff --git a/babl/base/pow-24.c b/babl/base/pow-24.c
index 50d8d87..401a102 100644
--- a/babl/base/pow-24.c
+++ b/babl/base/pow-24.c
@@ -1,5 +1,5 @@
 /* babl - dynamically extendable universal pixel conversion library.
- * Copyright (C) 2012, Red Hat, Inc.
+ * Copyright (C) 2013 Loren Merritt
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -16,170 +16,67 @@
  * <http://www.gnu.org/licenses/>.
  */
 
-#include <stdlib.h>
 #include <math.h>
 #include "util.h"
 
-/**
- * This implements pow(x, 2.4) and pow(x, 1/2.4) using a chebyshev based polynominal approximation.
- * This is based on the approach in:
- * http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent/6478839#6478839
+/* a^b = exp(b*log(a))
  *
- * The code includes checbychev constants for the 9th order, which seems to give a max error
- * of about 4e-09, but unless you define USE_MAX_POW24_ACCURACY the order has been limited to
- * give an error of about 1e-7 (i.e. single precission floats).
- */
- 
-/* Chebychev polynomial terms for x^(5/12) expanded around x=1.5
- * Non-zero terms calculated via
- * NIntegrate[(2/Pi)*ChebyshevT[N,u]/Sqrt [1-u^2]*((u+3)/2)^(5/12),{u,-1,1}, PrecisionGoal->20, WorkingPrecision -> 100]
- * Zeroth term is similar except it uses 1/pi rather than 2/pi.
+ * Extracting the exponent from a float gives us an approximate log.
+ * Or better yet, reinterpret the bitpattern of the whole float as an int.
+ *
+ * However, the output values of 12throot vary by less than a factor of 2
+ * over the domain we care about, so we only get log() that way, not exp().
+ *
+ * Approximate exp() with a low-degree polynomial; not exactly equal to the
+ * Taylor series since we're minimizing maximum error over a certain finite
+ * domain. It's not worthwhile to use lots of terms, since Newton's method
+ * has a better convergence rate once you get reasonably close to the answer.
  */
-static const double Cn[] = { 
-  1.1758200232996901923,
-  0.16665763094889061230,
-  -0.0083154894939042125035,
-  0.00075187976780420279038,
-  -0.000083240178519391795367,
-  0.000010229209410070008679,
-  -1.3401001466409860246e-6,
-  1.8333422241635376682e-7,
-  -2.5878596761348859722e-8
-};
-
-
-/* Returns x^(/12) for x in [1,2) */
-static double pow512norm (double x)
+static inline double
+init_newton (double x, double exponent, double c0, double c1, double c2)
 {
-   double Tn[9];
-   double u;
-
-   u = 2.0*x - 3.0;
-   Tn[0] = 1.0;
-   Tn[1] = u;
-   Tn[2] = 2*u*Tn[2-1] - Tn[2-2];
-   Tn[3] = 2*u*Tn[3-1] - Tn[3-2];
-   Tn[4] = 2*u*Tn[4-1] - Tn[4-2];
-   Tn[5] = 2*u*Tn[5-1] - Tn[5-2];
-   Tn[6] = 2*u*Tn[6-1] - Tn[6-2];
-#ifdef USE_MAX_POW24_ACCURACY
-   Tn[7] = 2*u*Tn[7-1] - Tn[7-2];
-   Tn[8] = 2*u*Tn[8-1] - Tn[8-2];
-#endif
-
-   return Cn[0]*Tn[0] + Cn[1]*Tn[1] + Cn[2]*Tn[2] + Cn[3]*Tn[3] + Cn[4]*Tn[4] + Cn[5]*Tn[5] + Cn[6]*Tn[6]
-#ifdef USE_MAX_POW24_ACCURACY
-     + Cn[7]*Tn[7] + Cn[8]*Tn[8]
-#endif
-     ;
+    int iexp;
+    double y = frexp(x, &iexp);
+    y = 2*y+(iexp-2);
+    c1 *= M_LN2*exponent;
+    c2 *= M_LN2*M_LN2*exponent*exponent;
+    return y = c0 + c1*y + c2*y*y;
 }
 
-/* Precalculated (2^N) ^ (5 / 12) */
-static const double pow2_512[12] = {
-  1.0,
-  1.3348398541700343678,
-  1.7817974362806785482,
-  2.3784142300054420538,
-  3.1748021039363991669,
-  4.2378523774371812394,
-  5.6568542494923805819,
-  7.5509945014535482244,
-  1.0079368399158985525e1,
-  1.3454342644059433809e1,
-  1.7959392772949968275e1,
-  2.3972913230026907883e1
-};
-
-
-/* Returns x^(1/2.4) == x^(5/12) */
-double babl_pow_1_24 (double x)
-{
-   double s;
-   int iexp;
-   div_t qr;
-
-   s = frexp (x, &iexp);
-   s *= 2.0;
-   iexp -= 1;
-
-   qr = div (iexp, 12);
-   if (qr.rem < 0) {
-      qr.quot -= 1;
-      qr.rem += 12;
-   }
-
-   return ldexp (pow512norm (s) * pow2_512[qr.rem], 5 * qr.quot);
-}
-
-/* Chebychev polynomial terms for x^(7/5) expanded around x=1.5
- * Non-zero terms calculated via
- * NIntegrate[(2/Pi)*ChebyshevT[N,u]/Sqrt [1-u^2]*((u+3)/2)^(7/5),{u,-1,1}, PrecisionGoal->20, WorkingPrecision -> 100]
- * Zeroth term is similar except it uses 1/pi rather than 2/pi.
+/* Returns x^2.4 == (x*(x^(-1/5)))^3, using Newton's method for x^(-1/5).
+ * Max absolute error is 4e-8.
+ * One more iteration of Newton would bring error down to 4e-15.
+ *
+ * (The above measurements apply to the input range can that be involved in
+ * gamma correction. Accuracy for inputs very close to 0 is somewhat worse,
+ * but that will hit the linear part of the gamma curve rather than call these.
+ * Out-of-range pixels >1.3 are also somewhat worse.)
  */
-static const double iCn[] = {
-  1.7917488588043277509,
-  0.82045614371976854984,
-  0.027694100686325412819,
-  -0.00094244335181762134018,
-  0.000064355540911469709545,
-  -5.7224404636060757485e-6,
-  5.8767669437311184313e-7,
-  -6.6139920053589721168e-8,
-  7.9323242696227458163e-9
-};
-
-/* Returns x^(7/5) for x in [1,2) */
-static double pow75norm (double x)
+double
+babl_pow_24 (double x)
 {
-   double Tn[9];
-   double u;
-
-   u = 2.0*x - 3.0;
-   Tn[0] = 1.0;
-   Tn[1] = u;
-   Tn[2] = 2*u*Tn[2-1] - Tn[2-2];
-   Tn[3] = 2*u*Tn[3-1] - Tn[3-2];
-   Tn[4] = 2*u*Tn[4-1] - Tn[4-2];
-   Tn[5] = 2*u*Tn[5-1] - Tn[5-2];
-#ifdef USE_MAX_POW24_ACCURACY
-   Tn[6] = 2*u*Tn[6-1] - Tn[6-2];
-   Tn[7] = 2*u*Tn[7-1] - Tn[7-2];
-   Tn[8] = 2*u*Tn[8-1] - Tn[8-2];
-#endif
-
-   return iCn[0]*Tn[0] + iCn[1]*Tn[1] + iCn[2]*Tn[2] + iCn[3]*Tn[3] + iCn[4]*Tn[4] + iCn[5]*Tn[5]
-#ifdef USE_MAX_POW24_ACCURACY
-     + iCn[6]*Tn[6] + iCn[7]*Tn[7] + iCn[8]*Tn[8]
-#endif
-     ;
+  double y = init_newton (x, -1./5, 0.9953189663, 0.9594345146, 0.6742970332);
+  int i;
+  for (i = 0; i < 2; i++)
+    y = (1.+1./5)*y - ((1./5)*x*(y*y))*((y*y)*(y*y));
+  x *= y;
+  return x*x*x;
 }
 
-/* Precalculated (2^N) ^ (7 / 5) */
-static const double pow2_75[5] = {
-  1.0,
-  2.6390158215457883983,
-  6.9644045063689921093,
-  1.8379173679952558018e+1,
-  4.8502930128332728543e+1
-};
-
-/* Returns x^2.4 == x * x ^1.4 == x * x^(7/5) */
-double babl_pow_24 (double x)
+/* Returns x^(1/2.4) == x*(x^(-1/12))^7, using Newton's method for x^(-1/12).
+ * Max absolute error is 7e-8
+ * One more iteration of Newton would bring error down to 4e-14.
+ */
+double
+babl_pow_1_24 (double x)
 {
-   double s;
-   int iexp;
-   div_t qr;
-
-   s = frexp (x, &iexp);
-   s *= 2.0;
-   iexp -= 1;
-
-   qr = div (iexp, 5);
-   if (qr.rem < 0) {
-      qr.quot -= 1;
-      qr.rem += 5;
-   }
-
-   return x * ldexp (pow75norm (s) * pow2_75[qr.rem], 7 * qr.quot);
+  double y = init_newton (x, -1./12, 0.9976740658, 0.9885035151, 0.5907708377);
+  int i;
+  for (i = 0; i < 2; i++)
+    {
+      double z = (y*y)*(y*y);
+      z = (y*z)*(z*z);
+      y = (1.+1./12)*y - (1./12)*x*z;
+    }
+  return ((x*y)*(y*y))*((y*y)*(y*y));
 }
-
-- 
2.30.2